% Google NMT system paper, cited as an arXiv/CoRR preprint (arXiv id 1609.08144).
% Non-ASCII letters are written as LaTeX escapes so classic 8-bit BibTeX sorts
% and prints the name correctly.
@article{googleNMT,
  author        = {Yonghui Wu and Mike Schuster and Zhifeng Chen and Quoc V. Le and Mohammad Norouzi and Wolfgang Macherey and Maxim Krikun and Yuan Cao and Qin Gao and Klaus Macherey and Jeff Klingner and Apurva Shah and Melvin Johnson and Xiaobing Liu and {\L}ukasz Kaiser and Stephan Gouws and Yoshikiyo Kato and Taku Kudo and Hideto Kazawa and Keith Stevens and George Kurian and Nishant Patil and Wei Wang and Cliff Young and Jason Smith and Jason Riesa and Alex Rudnick and Oriol Vinyals and Greg Corrado and Macduff Hughes and Jeffrey Dean},
  title         = {Google's Neural Machine Translation System: Bridging the Gap between Human and Machine Translation},
  journal       = {CoRR},
  volume        = {abs/1609.08144},
  year          = {2016},
  url           = {http://arxiv.org/abs/1609.08144},
  archivePrefix = {arXiv},
  eprint        = {1609.08144}
}

% MNIST dataset home page. This is a web resource with no journal, so it uses
% the misc entry type; the address is carried in both howpublished (printed by
% classic styles) and url (printed by url-aware styles).
% The remaining fields are BibSonomy export metadata and are ignored by styles.
@misc{mnist,
  author       = {LeCun, Yann and Cortes, Corinna},
  title        = {{MNIST} handwritten digit database},
  howpublished = {http://yann.lecun.com/exdb/mnist/},
  year         = 2010,
  url          = {http://yann.lecun.com/exdb/mnist/},
  lastchecked  = {2016-01-14 14:24:11},
  keywords     = {MSc _checked character_recognition mnist network neural},
  added-at     = {2010-06-28T21:16:30.000+0200},
  timestamp    = {2016-07-12T19:25:30.000+0200},
  biburl       = {https://www.bibsonomy.org/bibtex/2935bad99fa1f65e03c25b315aa3c1032/mhwombat},
  groups       = {public},
  interhash    = {21b9d0558bd66279df9452562df6e6f3},
  intrahash    = {935bad99fa1f65e03c25b315aa3c1032},
  username     = {mhwombat}
}

% BERT paper, cited as an arXiv/CoRR preprint (record exported from dblp).
@article{bert,
  title         = {{BERT:} Pre-training of Deep Bidirectional Transformers for Language Understanding},
  author        = {Jacob Devlin and Ming{-}Wei Chang and Kenton Lee and Kristina Toutanova},
  journal       = {CoRR},
  volume        = {abs/1810.04805},
  year          = {2018},
  eprint        = {1810.04805},
  archivePrefix = {arXiv},
  url           = {http://arxiv.org/abs/1810.04805},
  bibsource     = {dblp computer science bibliography, https://dblp.org},
  biburl        = {https://dblp.org/rec/bib/journals/corr/abs-1810-04805},
  timestamp     = {Tue, 30 Oct 2018 20:39:56 +0100}
}
% Differentiable Subset Sampling, cited as an arXiv/CoRR preprint (record exported from dblp).
@article{DSS,
  title         = {Differentiable Subset Sampling},
  author        = {Sang Michael Xie and Stefano Ermon},
  journal       = {CoRR},
  volume        = {abs/1901.10517},
  year          = {2019},
  eprint        = {1901.10517},
  archivePrefix = {arXiv},
  url           = {http://arxiv.org/abs/1901.10517},
  bibsource     = {dblp computer science bibliography, https://dblp.org},
  biburl        = {https://dblp.org/rec/bib/journals/corr/abs-1901-10517},
  timestamp     = {Sun, 03 Feb 2019 00:00:00 +0100}
}
% Neural Arithmetic Expression Calculator, cited as an arXiv/CoRR preprint (record exported from dblp).
@article{NAEC,
  title         = {Neural Arithmetic Expression Calculator},
  author        = {Kaiyu Chen and Yihan Dong and Xipeng Qiu and Zitian Chen},
  journal       = {CoRR},
  volume        = {abs/1809.08590},
  year          = {2018},
  eprint        = {1809.08590},
  archivePrefix = {arXiv},
  url           = {http://arxiv.org/abs/1809.08590},
  bibsource     = {dblp computer science bibliography, https://dblp.org},
  biburl        = {https://dblp.org/rec/bib/journals/corr/abs-1809-08590},
  timestamp     = {Fri, 05 Oct 2018 01:00:00 +0200}
}
% Workshop paper on measuring arithmetic extrapolation performance.
% NOTE(review): the author is given as Anonymous, presumably for blind review;
% confirm and restore the real author list before a non-anonymous submission.
% The arXiv identity lives in eprint/archivePrefix/url only: a journal or volume
% field here would be printed by standard styles as a bogus proceedings volume.
@inproceedings{maep-madsen-johansen-2019,
    author        = {Anonymous},
    title         = {Measuring Arithmetic Extrapolation Performance},
    booktitle     = {Science meets Engineering of Deep Learning at 33rd Conference on Neural Information Processing Systems (NeurIPS 2019)},
    address       = {Vancouver, Canada},
    month         = oct,
    year          = {2019},
    url           = {http://arxiv.org/abs/1910.01888},
    archivePrefix = {arXiv},
    primaryClass  = {cs.LG},
    eprint        = {1910.01888},
    timestamp     = {Fri, 4 Oct 2019 12:00:36 UTC}
}
% Improved Neural GPU architecture paper, cited as an arXiv/CoRR preprint (record exported from dblp).
@article{FreivaldsL17,
  title         = {Improving the Neural {GPU} Architecture for Algorithm Learning},
  author        = {Karlis Freivalds and Renars Liepins},
  journal       = {CoRR},
  volume        = {abs/1702.08727},
  year          = {2017},
  eprint        = {1702.08727},
  archivePrefix = {arXiv},
  url           = {http://arxiv.org/abs/1702.08727},
  bibsource     = {dblp computer science bibliography, https://dblp.org},
  biburl        = {https://dblp.org/rec/bib/journals/corr/FreivaldsL17},
  timestamp     = {Mon, 13 Aug 2018 16:49:00 +0200}
}
% Neural Turing Machines, cited as an arXiv/CoRR preprint (record exported from dblp).
% {Turing} is braced so sentence-casing styles keep the proper noun capitalised.
@article{NTM,
  author    = {Alex Graves and
               Greg Wayne and
               Ivo Danihelka},
  title     = {Neural {Turing} Machines},
  journal   = {CoRR},
  volume    = {abs/1410.5401},
  year      = {2014},
  url       = {http://arxiv.org/abs/1410.5401},
  archivePrefix = {arXiv},
  eprint    = {1410.5401},
  timestamp = {Mon, 13 Aug 2018 01:00:00 +0200},
  biburl    = {https://dblp.org/rec/bib/journals/corr/GravesWD14},
  bibsource = {dblp computer science bibliography, https://dblp.org}
}
% Learning Dexterous In-Hand Manipulation, cited as an arXiv/CoRR preprint
% (record exported from dblp).
% {OpenAI} is a corporate author and is braced so it is treated as one
% indivisible name.
% NOTE(review): Pachocki is listed once here; the exported record carried him
% under two spellings (with and without the middle initial). Confirm the
% preferred spelling against the paper.
@article{openai-learning-dexterous,
  author    = {{OpenAI} and
               Marcin Andrychowicz and
               Bowen Baker and
               Maciek Chociej and
               Rafal J{\'{o}}zefowicz and
               Bob McGrew and
               Jakub Pachocki and
               Arthur Petron and
               Matthias Plappert and
               Glenn Powell and
               Alex Ray and
               Jonas Schneider and
               Szymon Sidor and
               Josh Tobin and
               Peter Welinder and
               Lilian Weng and
               Wojciech Zaremba},
  title     = {Learning Dexterous In-Hand Manipulation},
  journal   = {CoRR},
  volume    = {abs/1808.00177},
  year      = {2018},
  url       = {http://arxiv.org/abs/1808.00177},
  archivePrefix = {arXiv},
  eprint    = {1808.00177},
  timestamp = {Thu, 14 Feb 2019 11:14:23 +0100},
  biburl    = {https://dblp.org/rec/bib/journals/corr/abs-1808-00177},
  bibsource = {dblp computer science bibliography, https://dblp.org}
}
% Grid LSTM paper, ICLR 2016 conference track (record exported from dblp).
@inproceedings{GridLSTM,
  title     = {Grid Long Short-Term Memory},
  author    = {Nal Kalchbrenner and Ivo Danihelka and Alex Graves},
  booktitle = {4th International Conference on Learning Representations, {ICLR} 2016, San Juan, Puerto Rico, May 2-4, 2016, Conference Track Proceedings},
  year      = {2016},
  url       = {http://arxiv.org/abs/1507.01526},
  bibsource = {dblp computer science bibliography, https://dblp.org},
  biburl    = {https://dblp.org/rec/bib/journals/corr/KalchbrennerDG15},
  timestamp = {Fri, 29 Mar 2019 00:00:00 +0100}
}

% GQA dataset paper, cited as an arXiv/CoRR preprint (record exported from dblp).
@article{drewspaper,
  title         = {{GQA:} a new dataset for compositional question answering over real-world images},
  author        = {Drew A. Hudson and Christopher D. Manning},
  journal       = {CoRR},
  volume        = {abs/1902.09506},
  year          = {2019},
  eprint        = {1902.09506},
  archivePrefix = {arXiv},
  url           = {http://arxiv.org/abs/1902.09506},
  bibsource     = {dblp computer science bibliography, https://dblp.org},
  biburl        = {https://dblp.org/rec/bib/journals/corr/abs-1902-09506},
  timestamp     = {Tue, 21 May 2019 18:03:36 +0200}
}

% CLEVR dataset paper (CVPR 2017).
% {CLEVR} is braced because sentence-casing styles keep only the first letter
% of a title and would otherwise print the acronym as Clevr.
@inproceedings{johnson2017clevr,
  author    = {Johnson, Justin and Hariharan, Bharath and van der Maaten, Laurens
               and Fei-Fei, Li and Zitnick, C. Lawrence and Girshick, Ross},
  title     = {{CLEVR}: A Diagnostic Dataset for Compositional Language and Elementary Visual Reasoning},
  booktitle = {{IEEE} Conference on Computer Vision and Pattern Recognition ({CVPR})},
  year      = {2017}
}

% Natural Questions benchmark paper, published in TACL.
% The dataset name is braced so sentence-casing styles keep it capitalised.
@article{naturalquestions,
  author  = {Tom Kwiatkowski and Jennimaria Palomaki and Olivia Redfield and Michael Collins and Ankur Parikh and Chris Alberti and Danielle Epstein and Illia Polosukhin and Matthew Kelcey and Jacob Devlin and Kenton Lee and Kristina N. Toutanova and Llion Jones and Ming-Wei Chang and Andrew Dai and Jakob Uszkoreit and Quoc Le and Slav Petrov},
  title   = {{Natural Questions}: a Benchmark for Question Answering Research},
  journal = {Transactions of the Association for Computational Linguistics},
  year    = {2019}
}

% Neural GPU paper, ICLR 2016 conference track (record exported from dblp).
% {GPUs} is braced so sentence-casing styles do not lowercase the acronym.
@inproceedings{NeuralGPU,
  author    = {Lukasz Kaiser and
               Ilya Sutskever},
  title     = {Neural {GPUs} Learn Algorithms},
  booktitle = {4th International Conference on Learning Representations, {ICLR} 2016,
               San Juan, Puerto Rico, May 2-4, 2016, Conference Track Proceedings},
  year      = {2016},
  url       = {http://arxiv.org/abs/1511.08228},
  timestamp = {Fri, 29 Mar 2019 00:00:00 +0100},
  biburl    = {https://dblp.org/rec/bib/journals/corr/KaiserS15},
  bibsource = {dblp computer science bibliography, https://dblp.org}
}

% ResNet paper, CVPR 2016 (record exported from dblp).
@inproceedings{resnet,
  title     = {Deep Residual Learning for Image Recognition},
  author    = {Kaiming He and Xiangyu Zhang and Shaoqing Ren and Jian Sun},
  booktitle = {2016 {IEEE} Conference on Computer Vision and Pattern Recognition, {CVPR} 2016, Las Vegas, NV, USA, June 27-30, 2016},
  pages     = {770--778},
  year      = {2016},
  doi       = {10.1109/CVPR.2016.90},
  url       = {https://doi.org/10.1109/CVPR.2016.90},
  bibsource = {dblp computer science bibliography, https://dblp.org},
  biburl    = {https://dblp.org/rec/bib/conf/cvpr/HeZRS16},
  timestamp = {Wed, 17 Apr 2019 01:00:00 +0200}
}

% LSTM generalisation on formal languages, SCiL 2019.
% month uses the predefined macro so styles can localise or abbreviate it;
% {LSTM} is braced so sentence-casing styles do not lowercase the acronym.
@inproceedings{suzgun2019evaluating,
  author    = {Suzgun, Mirac and Belinkov, Yonatan and Shieber, Stuart M.},
  title     = {On Evaluating the Generalization of {LSTM} Models in Formal Languages},
  booktitle = {Proceedings of the Society for Computation in Linguistics (SCiL)},
  pages     = {277--286},
  year      = {2019},
  month     = jan
}
% Lake and Baroni on compositional skills of seq2seq RNNs, ICML 2018 (record exported from dblp).
@inproceedings{stillNotSystematic,
  title     = {Generalization without Systematicity: On the Compositional Skills of Sequence-to-Sequence Recurrent Networks},
  author    = {Brenden M. Lake and Marco Baroni},
  booktitle = {Proceedings of the 35th International Conference on Machine Learning, {ICML} 2018, Stockholmsm{\"{a}}ssan, Stockholm, Sweden, July 10-15, 2018},
  pages     = {2879--2888},
  year      = {2018},
  url       = {http://proceedings.mlr.press/v80/lake18a.html},
  bibsource = {dblp computer science bibliography, https://dblp.org},
  biburl    = {https://dblp.org/rec/bib/conf/icml/LakeB18},
  timestamp = {Wed, 03 Apr 2019 18:17:30 +0200}
}

% Adam optimiser paper, cited as the ICLR 2015 conference paper named in
% booktitle; the arXiv identity (1412.6980) is kept in the eprint fields.
% NOTE(review): year/month follow the conference (2015) rather than the arXiv
% posting date (Dec 2014) so they agree with booktitle. Confirm this is the
% intended citation year for author-year styles.
@inproceedings{adam-optimization,
  author        = {Kingma, Diederik P. and Ba, Jimmy},
  title         = {Adam: A Method for Stochastic Optimization},
  booktitle     = {The 3rd International Conference on Learning Representations, San Diego, 2015},
  year          = {2015},
  month         = may,
  keywords      = {Computer Science - Machine Learning},
  eid           = {arXiv:1412.6980},
  archivePrefix = {arXiv},
  eprint        = {1412.6980},
  primaryClass  = {cs.LG}
}

% Glorot and Bengio initialisation paper, AISTATS 2010 (JMLR W&CP volume 9).
% Page ranges use a double hyphen so they typeset as an en dash.
@inproceedings{glorot-initialization,
     author = {Glorot, Xavier and Bengio, Yoshua},
      month = may,
      title = {Understanding the difficulty of training deep feedforward neural networks},
  booktitle = {JMLR W\&CP: Proceedings of the Thirteenth International Conference on Artificial Intelligence and Statistics (AISTATS 2010)},
     volume = {9},
       year = {2010},
      pages = {249--256},
   location = {Chia Laguna Resort, Sardinia, Italy},
   abstract = {Whereas before 2006 it appears that deep multi-layer neural networks were not successfully trained, since then several algorithms have been shown to successfully train them, with experimental results showing the superiority of deeper vs less deep architectures. All these experimental results were obtained with new initialization or training mechanisms. Our objective here is to understand better why standard gradient descent from random initialization is doing so poorly with deep neural networks, to better understand these recent relative successes and help design better algorithms in the future. We first observe the influence of the non-linear activations functions. We find that the logistic sigmoid activation is unsuited for deep networks with random initialization because of its mean value, which can drive especially the top hidden layer into saturation. Surprisingly, we find that saturated units can move out of saturation by themselves, albeit slowly, and explaining the plateaus sometimes seen when training neural networks. We find that a new non-linearity that saturates less can often be beneficial. Finally, we study how activations and gradients vary across layers and during training, with the idea that training may be more difficult when the singular values of the Jacobian associated with each layer are far from 1. Based on these considerations, we propose a new initialization scheme that brings substantially faster convergence.}
}


% AlphaGo paper in Nature (record exported from dblp).
% {Go} is braced so sentence-casing styles keep the name of the game capitalised.
@article{natureGo,
  author    = {David Silver and
               Aja Huang and
               Chris J. Maddison and
               Arthur Guez and
               Laurent Sifre and
               George van den Driessche and
               Julian Schrittwieser and
               Ioannis Antonoglou and
               Vedavyas Panneershelvam and
               Marc Lanctot and
               Sander Dieleman and
               Dominik Grewe and
               John Nham and
               Nal Kalchbrenner and
               Ilya Sutskever and
               Timothy P. Lillicrap and
               Madeleine Leach and
               Koray Kavukcuoglu and
               Thore Graepel and
               Demis Hassabis},
  title     = {Mastering the game of {Go} with deep neural networks and tree search},
  journal   = {Nature},
  volume    = {529},
  number    = {7587},
  pages     = {484--489},
  year      = {2016},
  url       = {https://doi.org/10.1038/nature16961},
  doi       = {10.1038/nature16961},
  timestamp = {Wed, 14 Nov 2018 00:00:00 +0100},
  biburl    = {https://dblp.org/rec/bib/journals/nature/SilverHMGSDSAPL16},
  bibsource = {dblp computer science bibliography, https://dblp.org}
}


% Review article in Nature Reviews Neuroscience (record imported via RIS/BibDesk).
% NOTE(review): the imported record only carried the first page (366); add the
% end page once confirmed against the publisher record.
% The DOI is the one already present in the Url and L3 fields.
% L3, M3, Ty, Day and the Date-*/Bdsk-* fields are import metadata ignored by styles.
@article{nieder-neuronal-number,
  Author        = {Nieder, Andreas},
  Title         = {The neuronal code for number},
  Journal       = {Nature Reviews Neuroscience},
  Volume        = {17},
  Pages         = {366},
  Year          = {2016},
  Month         = may,
  Day           = {06},
  Publisher     = {Nature Publishing Group},
  Doi           = {10.1038/nrn.2016.40},
  Url           = {https://doi.org/10.1038/nrn.2016.40},
  L3            = {10.1038/nrn.2016.40; },
  M3            = {Review Article},
  Ty            = {JOUR},
  Date-Added    = {2019-05-02 13:12:18 +0000},
  Date-Modified = {2019-05-02 13:12:18 +0000},
  Bdsk-Url-1    = {https://doi.org/10.1038/nrn.2016.40}
}

% Rugani et al. on proto-arithmetic in newborn chicks, Proc. R. Soc. B (publisher export).
% Page ranges use a double hyphen so they typeset as an en dash.
% NOTE(review): eprint holds the publisher PDF URL rather than a preprint
% identifier; styles that print eprint will show it verbatim.
@article{rugani-arithmetic-chicks,
author = {Rosa Rugani and Laura Fontanari and Eleonora Simoni and Lucia Regolin and Giorgio Vallortigara},
title = {Arithmetic in newborn chicks},
journal = {Proceedings of the Royal Society B: Biological Sciences},
volume = {276},
number = {1666},
pages = {2451--2460},
year = {2009},
doi = {10.1098/rspb.2009.0044},
URL = {https://royalsocietypublishing.org/doi/abs/10.1098/rspb.2009.0044},
eprint = {https://royalsocietypublishing.org/doi/pdf/10.1098/rspb.2009.0044},
abstract = { Newly hatched domestic chicks were reared with five identical objects. On days 3 or 4, chicks underwent free-choice tests in which sets of three and two of the five original objects disappeared (either simultaneously or one by one), each behind one of two opaque identical screens. Chicks spontaneously inspected the screen occluding the larger set (experiment 1). Results were confirmed under conditions controlling for continuous variables (total surface area or contour length; experiment 2). In the third experiment, after the initial disappearance of the two sets (first event, FE), some of the objects were visibly transferred, one by one, from one screen to the other (second event, SE). Thus, computation of a series of subsequent additions or subtractions of elements that appeared and disappeared, one by one, was needed in order to perform the task successfully. Chicks spontaneously chose the screen, hiding the larger number of elements at the end of the SE, irrespective of the directional cues provided by the initial (FE) and final (SE) displacements. Results suggest impressive proto-arithmetic capacities in the young and relatively inexperienced chicks of this precocial species. }
}

% Gallistel, Phil. Trans. R. Soc. B (publisher export).
% The pages field holds the article number, as given by the publisher record.
% The quotation marks in the abstract are written as LaTeX quotes; the export
% contained mis-encoded non-ASCII bytes there.
@article{gallistel-numbers-in-brain,
author = {C. R. Gallistel},
title = {Finding numbers in the brain},
journal = {Philosophical Transactions of the Royal Society B: Biological Sciences},
volume = {373},
number = {1740},
pages = {20170119},
year = {2018},
doi = {10.1098/rstb.2017.0119},
URL = {https://royalsocietypublishing.org/doi/abs/10.1098/rstb.2017.0119},
eprint = {https://royalsocietypublishing.org/doi/pdf/10.1098/rstb.2017.0119},
abstract = { After listing functional constraints on what numbers in the brain must do, I sketch the two's complement fixed-point representation of numbers because it has stood the test of time and because it illustrates the non-obvious ways in which an effective coding scheme may operate. I briefly consider its neurobiological implementation. It is easier to imagine its implementation at the cell-intrinsic molecular level, with thermodynamically stable, volumetrically minimal polynucleotides encoding the remembered numbers, than at the circuit level, with plastic synapses encoding them. This article is part of a discussion meeting issue `The origins of numerical abilities'. }
}

% Rumelhart, Hinton and Williams back-propagation paper in Nature (BibSonomy export).
% The doi is the identifier already embedded in the url field.
% NOTE(review): the export had an open-ended page range; the end page 536 is
% supplied from the Nature record and should be confirmed.
% added-at, biburl, description, interhash, intrahash, keywords and timestamp
% are export metadata ignored by styles.
@article{rumelhart1986learning,
  author      = {Rumelhart, David E. and Hinton, Geoffrey E. and Williams, Ronald J.},
  title       = {Learning representations by back-propagating errors},
  journal     = {Nature},
  volume      = 323,
  pages       = {533--536},
  year        = 1986,
  month       = oct,
  publisher   = {Nature Publishing Group},
  doi         = {10.1038/323533a0},
  url         = {http://dx.doi.org/10.1038/323533a0},
  description = {Learning representations by back-propagating errors | Nature},
  keywords    = {deep-learning nature neural-networks paper},
  added-at    = {2018-06-03T13:17:55.000+0200},
  timestamp   = {2018-06-03T13:17:55.000+0200},
  biburl      = {https://www.bibsonomy.org/bibtex/25d95851c0f627ab11747a2e481ecbad6/achakraborty},
  interhash   = {c354bc293fa9aa7caffc66d40a014903},
  intrahash   = {5d95851c0f627ab11747a2e481ecbad6}
}



% NALU paper, a chapter in the edited volume Advances in Neural Information Processing Systems 31.
@incollection{trask-nalu,
  author    = {Trask, Andrew and Hill, Felix and Reed, Scott E and Rae, Jack and Dyer, Chris and Blunsom, Phil},
  title     = {Neural Arithmetic Logic Units},
  editor    = {S. Bengio and H. Wallach and H. Larochelle and K. Grauman and N. Cesa-Bianchi and R. Garnett},
  booktitle = {Advances in Neural Information Processing Systems 31},
  publisher = {Curran Associates, Inc.},
  year      = {2018},
  pages     = {8035--8044},
  url       = {http://papers.nips.cc/paper/8027-neural-arithmetic-logic-units.pdf}
}

% Wilson (1927), JASA (publisher export).
% The ampersand in the publisher name is escaped so it cannot break LaTeX when
% a style prints the field; the page range uses a double hyphen; URL values are
% stored without surrounding whitespace.
% NOTE(review): eprint holds the publisher PDF URL rather than a preprint
% identifier; styles that print eprint will show it verbatim.
@article{wilson-binomial,
author = {Edwin B. Wilson},
title = {Probable Inference, the Law of Succession, and Statistical Inference},
journal = {Journal of the American Statistical Association},
volume = {22},
number = {158},
pages = {209--212},
year  = {1927},
publisher = {Taylor \& Francis},
doi = {10.1080/01621459.1927.10502953},
URL = {https://www.tandfonline.com/doi/abs/10.1080/01621459.1927.10502953},
eprint = {https://www.tandfonline.com/doi/pdf/10.1080/01621459.1927.10502953}
}